Fraud Detection on Transactions#

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import numpy as np
# Load the raw transactions CSV into a pandas DataFrame.
data = pd.read_csv("fraud_detection.csv")
# display() is a Jupyter/IPython builtin; this previews the first five rows.
display(data.head())
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 277 CASH_OUT 135823.38 C1593233399 143871.00 8047.62 C1539414805 3200782.20 3336605.59 0 0
1 207 CASH_OUT 288639.74 C798341710 584.00 0.00 C1082976131 3005558.14 3294197.88 0 0
2 196 TRANSFER 553399.25 C1967632347 553399.25 0.00 C1971557254 0.00 0.00 1 0
3 79 TRANSFER 1619331.51 C144675030 1619331.51 0.00 C660991644 0.00 0.00 1 0
4 251 CASH_OUT 254768.53 C2070917948 5142.00 0.00 C1290691221 103461.62 358230.15 0 0

Normalizing the Dataset#

import sqlite3
from sqlite3 import Error

def create_connection(db_file, delete_db=False):
    """Open a SQLite connection to db_file with foreign keys enabled.

    If delete_db is True, an existing database file is removed first so a
    fresh schema can be built. Returns the connection, or None if opening
    fails (the error is printed, matching the other DB helpers here).
    """
    import os

    if delete_db and os.path.exists(db_file):
        os.remove(db_file)

    connection = None
    try:
        connection = sqlite3.connect(db_file)
        # Foreign-key enforcement is off by default in SQLite; turn it on.
        connection.execute("PRAGMA foreign_keys = 1")
    except Error as err:
        print(err)

    return connection


def create_table(conn, create_table_sql, drop_table_name=None):
    """Execute a CREATE TABLE statement, optionally dropping a table first.

    When drop_table_name is given, that table is dropped (if it exists)
    before create_table_sql runs. Errors are printed rather than raised,
    consistent with the other DB helpers in this file.
    """
    if drop_table_name:  # Optionally drop an existing table first.
        try:
            # NOTE: SQL identifiers cannot be bound as parameters, so the
            # table name is interpolated; callers must pass a trusted name.
            conn.cursor().execute("""DROP TABLE IF EXISTS %s""" % (drop_table_name))
        except Error as err:
            print(err)

    try:
        conn.cursor().execute(create_table_sql)
    except Error as err:
        print(err)
        
def execute_sql_statement(sql_statement, conn):
    """Run a query on conn and return every result row as a list of tuples."""
    return conn.cursor().execute(sql_statement).fetchall()
def create_schema_and_load_data(conn, data_file):
    """Create the normalized schema and bulk-load the transactions CSV.

    Builds four tables (transaction_types, accounts, transactions,
    balance_history), seeds the known transaction types, then streams
    data_file line by line: inserting accounts on first sight, the
    transaction itself, one balance_history row per side, and updating
    each account's current balance. Malformed rows are reported and
    skipped; work is committed every 1000 loaded rows and once at the end.

    Args:
        conn: open sqlite3 connection (foreign keys expected to be enabled).
        data_file: path to the raw CSV with a header row and 11 columns.
    """
    # Create tables
    transaction_types_sql = """
    CREATE TABLE IF NOT EXISTS transaction_types (
        type_id INTEGER PRIMARY KEY AUTOINCREMENT,
        type_name TEXT NOT NULL UNIQUE
    );"""

    accounts_sql = """
    CREATE TABLE IF NOT EXISTS accounts (
        account_id TEXT PRIMARY KEY,
        current_balance REAL DEFAULT 0.0
    );"""

    transactions_sql = """
    CREATE TABLE IF NOT EXISTS transactions (
        transaction_id INTEGER PRIMARY KEY,
        step INTEGER NOT NULL,
        type_id INTEGER,
        amount REAL NOT NULL,
        origin_account TEXT,
        destination_account TEXT,
        is_fraud INTEGER CHECK (is_fraud IN (0, 1)),
        is_flagged_fraud INTEGER CHECK (is_flagged_fraud IN (0, 1)),
        FOREIGN KEY (type_id) REFERENCES transaction_types(type_id),
        FOREIGN KEY (origin_account) REFERENCES accounts(account_id),
        FOREIGN KEY (destination_account) REFERENCES accounts(account_id)
    );"""

    balance_history_sql = """
    CREATE TABLE IF NOT EXISTS balance_history (
        balance_id INTEGER PRIMARY KEY AUTOINCREMENT,
        account_id TEXT,
        transaction_id INTEGER,
        old_balance REAL,
        new_balance REAL,
        is_destination INTEGER CHECK (is_destination IN (0, 1)),
        FOREIGN KEY (account_id) REFERENCES accounts(account_id),
        FOREIGN KEY (transaction_id) REFERENCES transactions(transaction_id)
    );"""

    # Create all tables
    create_table(conn, transaction_types_sql)
    create_table(conn, accounts_sql)
    create_table(conn, transactions_sql)
    create_table(conn, balance_history_sql)

    cursor = conn.cursor()

    # Seed the known transaction types and cache their generated IDs so each
    # CSV row can be mapped without a per-row lookup query.
    transaction_types = {
        'CASH_OUT': None,
        'CASH_IN': None,
        'TRANSFER': None,
        'PAYMENT': None
    }

    for type_name in transaction_types.keys():
        cursor.execute("INSERT OR IGNORE INTO transaction_types (type_name) VALUES (?)", (type_name,))
        cursor.execute("SELECT type_id FROM transaction_types WHERE type_name = ?", (type_name,))
        transaction_types[type_name] = cursor.fetchone()[0]

    conn.commit()

    # BUG FIX: the original gated periodic commits on
    # `cursor.rowcount % 1000`, but rowcount reflects only the *last*
    # statement executed (always 1 after the UPDATE preceding the check),
    # so intermediate commits never fired and everything sat in one giant
    # transaction. Track loaded rows explicitly instead.
    rows_loaded = 0

    # Read and process the data file
    with open(data_file, 'r') as f:
        # Skip header if it exists
        header = f.readline()

        for line in f:
            try:
                # Parse the CSV line
                fields = [field.strip() for field in line.strip().split(',')]
                if len(fields) != 11:  # Validate number of fields
                    print(f"Skipping invalid line: {line}")
                    continue

                step = int(fields[0])
                type_name = fields[1].strip()
                amount = float(fields[2])
                origin_account = fields[3]
                origin_old_balance = float(fields[4])
                origin_new_balance = float(fields[5])
                dest_account = fields[6]
                dest_old_balance = float(fields[7])
                dest_new_balance = float(fields[8])
                is_fraud = int(fields[9])
                is_flagged_fraud = int(fields[10])

                # Rows with an unknown transaction type are skipped silently.
                type_id = transaction_types.get(type_name)
                if type_id is None:
                    continue

                # Destination IDs not starting with C/M are treated as
                # invalid accounts (hoisted: checked three times originally).
                dest_is_valid = dest_account.startswith(('C', 'M'))

                # Insert accounts if they don't exist
                cursor.execute("INSERT OR IGNORE INTO accounts (account_id) VALUES (?)", (origin_account,))
                if dest_is_valid:
                    cursor.execute("INSERT OR IGNORE INTO accounts (account_id) VALUES (?)", (dest_account,))

                # Insert transaction
                cursor.execute("""
                    INSERT INTO transactions 
                    (step, type_id, amount, origin_account, destination_account, is_fraud, is_flagged_fraud)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                """, (step, type_id, amount, origin_account, dest_account, is_fraud, is_flagged_fraud))

                transaction_id = cursor.lastrowid

                # Insert balance history for origin account
                cursor.execute("""
                    INSERT INTO balance_history 
                    (account_id, transaction_id, old_balance, new_balance, is_destination)
                    VALUES (?, ?, ?, ?, 0)
                """, (origin_account, transaction_id, origin_old_balance, origin_new_balance))

                # Insert balance history for destination account
                if dest_is_valid:
                    cursor.execute("""
                        INSERT INTO balance_history 
                        (account_id, transaction_id, old_balance, new_balance, is_destination)
                        VALUES (?, ?, ?, ?, 1)
                    """, (dest_account, transaction_id, dest_old_balance, dest_new_balance))

                # Update current balances
                cursor.execute("UPDATE accounts SET current_balance = ? WHERE account_id = ?", 
                             (origin_new_balance, origin_account))
                if dest_is_valid:
                    cursor.execute("UPDATE accounts SET current_balance = ? WHERE account_id = ?", 
                                 (dest_new_balance, dest_account))

                rows_loaded += 1
                if rows_loaded % 1000 == 0:
                    conn.commit()

            except Exception as e:
                print(f"Error processing line: {line}")
                print(f"Error details: {str(e)}")
                continue

    conn.commit()
    print("Data loading completed successfully!")
# Rebuild the database from scratch (delete_db=True removes any old file),
# then load the raw CSV into the normalized schema.
conn = create_connection("fraud_detection.db", delete_db=True)
create_schema_and_load_data(conn, "fraud_detection.csv")
Data loading completed successfully!
import pandas as pd

def fetch_fraud_data(conn):
    """
    Rebuild the original flat fraud dataset from the normalized tables.

    Joins each transaction with its type name and with the origin and
    destination rows of balance_history, yielding one row per transaction
    under the raw dataset's original column names.

    Args:
        conn: SQLite database connection

    Returns:
        pandas.DataFrame: Combined fraud detection data
    """
    query = """
    SELECT 
        t.step,
        tt.type_name as type,
        t.amount,
        t.origin_account as nameOrig,
        orig_bal.old_balance as oldbalanceOrg,
        orig_bal.new_balance as newbalanceOrig,
        t.destination_account as nameDest,
        dest_bal.old_balance as oldbalanceDest,
        dest_bal.new_balance as newbalanceDest,
        t.is_fraud as isFraud,
        t.is_flagged_fraud as isFlaggedFraud
    FROM transactions t
    JOIN transaction_types tt ON t.type_id = tt.type_id
    JOIN balance_history orig_bal ON t.transaction_id = orig_bal.transaction_id 
        AND t.origin_account = orig_bal.account_id 
        AND orig_bal.is_destination = 0
    LEFT JOIN balance_history dest_bal ON t.transaction_id = dest_bal.transaction_id 
        AND t.destination_account = dest_bal.account_id 
        AND dest_bal.is_destination = 1
    ORDER BY t.step;
    """

    frame = pd.read_sql_query(query, conn)

    # The destination side comes from a LEFT JOIN and may be NULL; treat
    # missing balances as zero, mirroring the raw dataset's convention.
    for col in ('oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest'):
        frame[col] = frame[col].fillna(0.0)

    return frame

# Re-open the database and reconstruct the flat DataFrame from it.
conn = create_connection("fraud_detection.db")
df = fetch_fraud_data(conn)
# display() is a Jupyter/IPython builtin.
display(df)
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 CASH_OUT 132842.64 C13692003 4499.08 0.00 C297927961 0.00 132842.64 1 0
1 1 TRANSFER 181.00 C1305486145 181.00 0.00 C553264065 0.00 0.00 1 0
2 1 CASH_OUT 416001.33 C749981943 0.00 0.00 C667346055 102.00 9291619.62 1 0
3 1 PAYMENT 1915.43 C822087264 11450.00 9534.57 M30699728 0.00 0.00 0 0
4 1 CASH_OUT 181.00 C840083671 181.00 0.00 C38997010 21182.00 0.00 1 0
... ... ... ... ... ... ... ... ... ... ... ...
24510 743 TRANSFER 6311409.28 C1529008245 6311409.28 0.00 C1881841831 0.00 0.00 1 0
24511 743 CASH_OUT 6311409.28 C1162922333 6311409.28 0.00 C1365125890 68488.84 6379898.11 1 0
24512 743 TRANSFER 850002.52 C1685995037 850002.52 0.00 C2080388513 0.00 0.00 1 0
24513 743 CASH_OUT 850002.52 C1280323807 850002.52 0.00 C873221189 6510099.11 7360101.63 1 0
24514 743 CASH_OUT 339682.13 C786484425 339682.13 0.00 C776919290 0.00 339682.13 1 0

24515 rows × 11 columns

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

def explore_fraud_data(conn):
    """
    Comprehensive exploration of the fraud detection dataset.

    Pulls the flat DataFrame via fetch_fraud_data, then prints shape,
    dtypes, summary statistics, fraud class balance and per-type fraud
    rates; draws a correlation heatmap, per-feature histograms and an
    amount box plot; and reports missing values, zero counts and
    IQR-based outliers per numeric feature.

    Args:
        conn: SQLite database connection to the normalized schema.

    Returns:
        dict with keys 'df' (the DataFrame), 'fraud_rate' (overall mean of
        isFraud), 'fraud_by_type' (row-normalized crosstab) and
        'correlation' (numeric correlation matrix).
    """
    # Fetch data
    df = fetch_fraud_data(conn)
    
    print("Dataset Shape:", df.shape)
    print("\nData Types:")
    print(df.dtypes)
    
    # Basic statistics
    print("\nBasic Statistics:")
    print(df.describe())
    
    # Class distribution (proportions, not raw counts)
    print("\nFraud Distribution:")
    print(df['isFraud'].value_counts(normalize=True))
    
    # Distribution by transaction type; normalize='index' gives the fraud
    # rate within each type rather than overall shares.
    fraud_by_type = pd.crosstab(df['type'], df['isFraud'], normalize='index')
    print("\nFraud Rate by Transaction Type:")
    print(fraud_by_type)
    
    # Correlation matrix over numeric columns only
    plt.figure(figsize=(12, 8))
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    correlation = df[numeric_cols].corr()
    sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()
    
    # Distribution plots for numeric features
    numeric_features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 
                       'oldbalanceDest', 'newbalanceDest']
    
    plt.figure(figsize=(15, 10))
    for i, feature in enumerate(numeric_features, 1):
        plt.subplot(2, 3, i)
        sns.histplot(data=df, x=feature, hue='isFraud', bins=50, alpha=0.5)
        plt.title(f'{feature} Distribution')
        # Log-log axes: amounts/balances are heavily right-skewed.
        plt.yscale('log')
        plt.xscale('log')
    plt.tight_layout()
    plt.show()
    
    # Box plots for amount by transaction type
    plt.figure(figsize=(12, 6))
    sns.boxplot(data=df, x='type', y='amount')
    plt.xticks(rotation=45)
    plt.title('Transaction Amount by Type')
    plt.yscale('log')
    plt.tight_layout()
    plt.show()
    
    # Check for missing values
    print("\nMissing Values:")
    print(df.isnull().sum())
    
    # Check for zero values
    print("\nZero Values Count:")
    zero_counts = (df[numeric_features] == 0).sum()
    print(zero_counts)
    
    # Analyze potential anomalies using the standard 1.5*IQR fence.
    print("\nPotential Anomalies:")
    for col in numeric_features:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        outliers = df[col][(df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))]
        print(f"\n{col}:")
        print(f"Number of outliers: {len(outliers)}")
        print(f"Min: {df[col].min()}")
        print(f"Max: {df[col].max()}")
    
    return {
        'df': df,
        'fraud_rate': df['isFraud'].mean(),
        'fraud_by_type': fraud_by_type,
        'correlation': correlation
    }

def stratification_analysis(df):
    """
    Analyze whether stratification is needed and by which variables.

    Prints the class balance, the fraud count/rate per transaction type,
    a chi-square test of independence between type and fraud, and the
    fraud rate per amount quartile.

    Args:
        df: DataFrame with at least 'type', 'isFraud' and 'amount' columns.
            The input frame is NOT modified.

    Returns:
        dict with 'chi2_p_value' (float) and 'fraud_by_type'
        (DataFrame of count/mean per type).
    """
    print("\nStratification Analysis:")

    # Check class imbalance
    print("\nClass Distribution:")
    print(df['isFraud'].value_counts(normalize=True))

    # Check fraud rate by transaction type
    print("\nFraud Rate by Transaction Type:")
    fraud_by_type = df.groupby('type')['isFraud'].agg(['count', 'mean'])
    print(fraud_by_type)

    # Chi-square test for independence between type and fraud
    contingency = pd.crosstab(df['type'], df['isFraud'])
    chi2, p_value = stats.chi2_contingency(contingency)[:2]
    print(f"\nChi-square test p-value: {p_value}")

    # Amount quartiles analysis.
    # BUG FIX: the original wrote the quartile labels back onto the caller's
    # DataFrame (df['amount_quartile'] = ...), mutating its input, and hit a
    # pandas FutureWarning by relying on the deprecated default observed=False
    # when grouping a categorical. Group by a local Series and pass
    # observed=False explicitly to keep the printed output identical.
    amount_quartile = pd.qcut(df['amount'], q=4,
                              labels=['Q1', 'Q2', 'Q3', 'Q4']).rename('amount_quartile')
    print("\nFraud Rate by Amount Quartile:")
    print(df.groupby(amount_quartile, observed=False)['isFraud'].mean())

    return {
        'chi2_p_value': p_value,
        'fraud_by_type': fraud_by_type
    }

def create_cleanup_tasks(df):
    """
    Derive a list of recommended data-cleanup tasks from the DataFrame.

    Checks, in order: missing values anywhere; zero and negative values in
    the monetary columns; origin balances that grew during a transaction;
    and IQR-based outliers in each monetary column (one task per column).

    Returns:
        list[str]: human-readable cleanup task descriptions.
    """
    money_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig',
                  'oldbalanceDest', 'newbalanceDest']
    tasks = []

    # Missing values anywhere in the frame.
    if df.isnull().sum().sum() > 0:
        tasks.append("Handle missing values")

    # Zero values in the monetary columns.
    if (df[money_cols] == 0).sum().sum() > 0:
        tasks.append("Investigate zero values in numeric columns")

    # Negative values in the monetary columns.
    if (df[money_cols] < 0).sum().sum() > 0:
        tasks.append("Handle negative values")

    # An origin balance that increased across the transaction is suspicious.
    if (df['oldbalanceOrg'] < df['newbalanceOrig']).any():
        tasks.append("Investigate balance inconsistencies")

    # Standard 1.5*IQR fence per monetary column.
    for col in money_cols:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        if ((df[col] < lower) | (df[col] > upper)).any():
            tasks.append(f"Handle outliers in {col}")

    return tasks

# Run the full EDA pipeline: explore the data, check whether stratification
# is warranted, and list recommended cleanup tasks.
conn = create_connection("fraud_detection.db")
results = explore_fraud_data(conn)
strat_results = stratification_analysis(results['df'])
cleanup_tasks = create_cleanup_tasks(results['df'])

print("\nRecommended Data Cleanup Tasks:")
for i, task in enumerate(cleanup_tasks, 1):
    print(f"{i}. {task}")
Dataset Shape: (24515, 11)

Data Types:
step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

Basic Statistics:
               step        amount  oldbalanceOrg  newbalanceOrig  \
count  24515.000000  2.451500e+04   2.451500e+04    2.451500e+04   
mean     285.437569  6.152231e+05   1.101861e+06    6.289189e+05   
std      181.064011  1.617177e+06   3.137237e+06    2.647180e+06   
min        1.000000  0.000000e+00   0.000000e+00    0.000000e+00   
25%      159.000000  2.399208e+04   5.725000e+02    0.000000e+00   
50%      265.000000  1.333361e+05   5.651100e+04    0.000000e+00   
75%      380.500000  3.582903e+05   4.975262e+05    2.193430e+04   
max      743.000000  3.188675e+07   5.958504e+07    4.958504e+07   

       oldbalanceDest  newbalanceDest       isFraud  isFlaggedFraud  
count    2.451500e+04    2.451500e+04  24515.000000    24515.000000  
mean     9.181891e+05    1.252046e+06      0.335019        0.000653  
std      3.606397e+06    4.000389e+06      0.472007        0.025539  
min      0.000000e+00    0.000000e+00      0.000000        0.000000  
25%      0.000000e+00    0.000000e+00      0.000000        0.000000  
50%      0.000000e+00    1.551389e+05      0.000000        0.000000  
75%      6.633293e+05    1.113139e+06      1.000000        0.000000  
max      2.362305e+08    2.367265e+08      1.000000        1.000000  

Fraud Distribution:
isFraud
0    0.664981
1    0.335019
Name: proportion, dtype: float64

Fraud Rate by Transaction Type:
isFraud          0         1
type                        
CASH_IN   1.000000  0.000000
CASH_OUT  0.585081  0.414919
PAYMENT   1.000000  0.000000
TRANSFER  0.244793  0.755207
_images/c88da33aae6e694521de50de2ea516794fdeeaa5d85fc028c9425d1a508223c5.png _images/7bff958868623f08acaaae4a1a5bab6442d16836bbef9898f874d76c68c8c9bd.png _images/0fa00cbf78078ddc088ee652f2a0bbb6bb9d018acd08272e88df1f884637f353.png
Missing Values:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

Zero Values Count:
amount               16
oldbalanceOrg      5463
newbalanceOrig    17323
oldbalanceDest    12299
newbalanceDest    10400
dtype: int64

Potential Anomalies:

amount:
Number of outliers: 3376
Min: 0.0
Max: 31886745.04

oldbalanceOrg:
Number of outliers: 4146
Min: 0.0
Max: 59585040.37

newbalanceOrig:
Number of outliers: 5355
Min: 0.0
Max: 49585040.37

oldbalanceDest:
Number of outliers: 3396
Min: 0.0
Max: 236230516.82

newbalanceDest:
Number of outliers: 2958
Min: 0.0
Max: 236726494.66

Stratification Analysis:

Class Distribution:
isFraud
0    0.664981
1    0.335019
Name: proportion, dtype: float64

Fraud Rate by Transaction Type:
          count      mean
type                     
CASH_IN    3582  0.000000
CASH_OUT   9920  0.414919
PAYMENT    5588  0.000000
TRANSFER   5425  0.755207

Chi-square test p-value: 0.0

Fraud Rate by Amount Quartile:
amount_quartile
Q1    0.094469
Q2    0.251754
Q3    0.267134
Q4    0.726709
Name: isFraud, dtype: float64

Recommended Data Cleanup Tasks:
1. Investigate zero values in numeric columns
2. Investigate balance inconsistencies
3. Handle outliers in amount
4. Handle outliers in oldbalanceOrg
5. Handle outliers in newbalanceOrig
6. Handle outliers in oldbalanceDest
7. Handle outliers in newbalanceDest
/var/folders/st/ypkxfxmd1j9br2vhgr1pbdch0000gn/T/ipykernel_25004/780226841.py:114: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.
  print(df.groupby('amount_quartile')['isFraud'].mean())
from ydata_profiling import ProfileReport

# Generate the profile report
# NOTE: ydata_profiling is a heavy optional dependency; profiling can take
# a while on large frames.
profile = ProfileReport(data, title="Data Profiling Report", explorative=True)

# Save the report as an HTML file
profile.to_file("data_profiling_report.html")
# Embed the interactive report inline in the notebook.
profile.to_notebook_iframe()

Observations from Exploratory Data Analysis (EDA):

  1. General Data Overview:

The dataset contains multiple features related to transactions, such as amount, oldbalanceOrg, newbalanceOrig, oldbalanceDest, and newbalanceDest. There is a binary target variable isFraud indicating whether a transaction is fraudulent.

  2. Distribution of isFraud:

The dataset is imbalanced: fraudulent transactions make up roughly a third (about 33.5%) of this sample — far more than in real-world transaction streams, but still uneven enough that resampling techniques or stratification during model training are advisable.

  3. Transaction Type Analysis:

Certain transaction types (TRANSFER and CASH_OUT) have higher fraud rates compared to others (PAYMENT and CASH_IN, which show no fraud in this sample). TRANSFER and CASH_OUT transactions likely involve significant amounts, making them a common target for fraudulent activities.

  4. Correlation Matrix Insights:

High correlation between oldbalanceOrg and newbalanceOrig indicates that the balances are closely linked, which aligns with the nature of balance updates during transactions. Low correlation between amount and other features suggests that the transaction amount varies independently of other balance-related features.

  5. Numeric Feature Distributions:

The distribution of amount and balance features (oldbalanceOrg, newbalanceOrig, etc.) is heavily skewed, with a few large transactions creating long tails. Log scaling the distributions reveals clearer patterns and reduces skewness, improving visibility into smaller transactions.

  6. Capped Values and Anomalies:

Some features show potential capping at certain values, indicating system-imposed limits. These capped values may obscure genuine patterns and need closer examination. Outliers are present in transaction amounts and balances, with extreme values possibly representing fraudulent activities or system errors.

Logistic Regression#

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import mlflow
from mlflow.models import infer_signature  # Added this import
import os
import warnings
warnings.filterwarnings('ignore')

# MLflow configuration
# SECURITY(review): a username and an access token are hardcoded below and
# committed to source. This token should be rotated immediately, and the
# values loaded from the environment or a secrets manager instead of being
# embedded in the notebook.
MLFLOW_TRACKING_URI = "https://dagshub.com/charankonduru2003/fraud_detection.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'charankonduru2003'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '0b22e21e2cfc7080ebc0fc65157efbff55fc501e'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("fraud_detection_experiment")

def log_transform(X):
    """Return log1p of |X| so zero and negative entries are handled safely."""
    return np.log1p(np.absolute(X))

def prepare_data(df):
    """Split the raw frame into features/target plus column groupings.

    Drops the target (isFraud) and the bank's own flag (isFlaggedFraud)
    from the feature matrix and lists which columns the preprocessing
    pipeline should treat as categorical vs numerical.

    Returns:
        (X, y, categorical_cols, numerical_cols)
    """
    y = df['isFraud']
    X = df.drop(columns=['isFlaggedFraud', 'isFraud'])

    # NOTE(review): nameOrig/nameDest are high-cardinality account IDs;
    # one-hot encoding them may blow up the feature space — confirm intent.
    categorical_cols = ['type', 'nameOrig', 'nameDest']
    numerical_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig',
                      'oldbalanceDest', 'newbalanceDest', 'step']

    return X, y, categorical_cols, numerical_cols

def create_preprocessing(numerical_cols, categorical_cols):
    """Build the ColumnTransformer applied ahead of the classifier.

    Numeric columns are log-transformed, standardized, then squeezed into
    [0, 1]; categorical columns are one-hot encoded, with categories unseen
    at fit time ignored at transform time rather than raising.
    """
    # NOTE(review): StandardScaler followed by MinMaxScaler double-scales —
    # the MinMax step alone fixes the final range; confirm this is intended.
    numeric_steps = Pipeline([
        ('log_transform', FunctionTransformer(log_transform)),
        ('scaler', StandardScaler()),
        ('minmax', MinMaxScaler()),
    ])

    one_hot_steps = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])

    return ColumnTransformer(transformers=[
        ('num', numeric_steps, numerical_cols),
        ('cat', one_hot_steps, categorical_cols),
    ])

def log_metrics(metrics_dict, prefix=""):
    """Log every metric in metrics_dict to MLflow, with an optional name prefix."""
    for name, value in metrics_dict.items():
        mlflow.log_metric(f"{prefix}{name}", value)

def evaluate_model(model, X, y):
    """Score model on (X, y); return F1 plus confusion-matrix cell counts."""
    predictions = model.predict(X)
    tn, fp, fn, tp = confusion_matrix(y, predictions).ravel()
    return {
        'f1_score': f1_score(y, predictions),
        'true_positives': tp,
        'true_negatives': tn,
        'false_positives': fp,
        'false_negatives': fn,
    }

def train_and_evaluate(df, n_cv_folds=3):
    """End-to-end LogisticRegression experiment logged to one MLflow run.

    Steps: split features/target, build the preprocessing+classifier
    pipeline, cross-validate it, grid-search hyperparameters, refit the
    best estimator on all of (X, y), and log parameters/metrics/model.

    Returns a dict with 'cv_scores', 'best_model', 'best_params' and
    'train_metrics'.
    """
    with mlflow.start_run(run_name="fraud_detection_experiment"):
        # Prepare data
        X, y, categorical_cols, numerical_cols = prepare_data(df)
        
        # Create and log preprocessing steps
        preprocessor = create_preprocessing(numerical_cols, categorical_cols)
        
        # Create pipeline
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', LogisticRegression(max_iter=1000))
        ])
        
        # Log pipeline parameters
        mlflow.log_param("n_cv_folds", n_cv_folds)
        mlflow.log_param("model_type", "LogisticRegression")
        
        # Perform cross-validation
        # NOTE(review): plain KFold does not preserve the fraud/non-fraud
        # ratio per fold; StratifiedKFold is the usual choice for an
        # imbalanced label — confirm before changing behavior.
        cv = KFold(n_splits=n_cv_folds, shuffle=True, random_state=42)
        cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1')
        
        # Log cross-validation results
        mlflow.log_metric("cv_f1_mean", cv_scores.mean())
        mlflow.log_metric("cv_f1_std", cv_scores.std())
        
        # Hyperparameter tuning
        param_grid = {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10],
            'classifier__class_weight': [None, 'balanced'],
            'classifier__solver': ['lbfgs', 'liblinear']
        }
        
        grid_search = GridSearchCV(
            pipeline,
            param_grid,
            cv=3,
            scoring='f1',
            n_jobs=-1
        )
        
        # Fit grid search
        grid_search.fit(X, y)
        
        # Log best parameters
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metric("best_cv_f1", grid_search.best_score_)
        
        # Evaluate on full training set
        # NOTE: these metrics are computed on the same data the model was
        # fitted on, so they are optimistic relative to the CV scores.
        best_model = grid_search.best_estimator_
        train_metrics = evaluate_model(best_model, X, y)
        log_metrics(train_metrics, "train_")
        
        # Log the model
        signature = infer_signature(X, y)
        mlflow.sklearn.log_model(best_model, "model", signature=signature)
        
        return {
            'cv_scores': cv_scores,
            'best_model': best_model,
            'best_params': grid_search.best_params_,
            'train_metrics': train_metrics
        }

# Run the pipeline with the existing DataFrame
results = train_and_evaluate(data)  # Changed df to data to match your variable name

# Print results
print("\nCross-validation Results:")
print(f"F1 Scores: {results['cv_scores']}")
# BUG FIX: the original f-string printed the mean and std run together with no
# separator and left an unbalanced ')' in the literal; print as "mean (±std)".
print(f"Mean F1: {results['cv_scores'].mean():.3f} (\u00b1{results['cv_scores'].std():.3f})")

print("\nBest Hyperparameters:")
print(results['best_params'])

print("\nTraining Set Metrics:")
for metric_name, metric_value in results['train_metrics'].items():
    print(f"{metric_name}: {metric_value}")
🏃 View run fraud_detection_experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/6609795d9f2d44e284baadc26f0b28ad
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
Cell In[9], line 146
    138         return {
    139             'cv_scores': cv_scores,
    140             'best_model': best_model,
    141             'best_params': grid_search.best_params_,
    142             'train_metrics': train_metrics
    143         }
    145 # Run the pipeline with the existing DataFrame
--> 146 results = train_and_evaluate(data)  # Changed df to data to match your variable name
    148 # Print results
    149 print("\nCross-validation Results:")

Cell In[9], line 101, in train_and_evaluate(df, n_cv_folds)
     99 # Perform cross-validation
    100 cv = KFold(n_splits=n_cv_folds, shuffle=True, random_state=42)
--> 101 cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1')
    103 # Log cross-validation results
    104 mlflow.log_metric("cv_f1_mean", cv_scores.mean())

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/utils/_param_validation.py:216, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    210 try:
    211     with config_context(
    212         skip_parameter_validation=(
    213             prefer_skip_nested_validation or global_skip_validation
    214         )
    215     ):
--> 216         return func(*args, **kwargs)
    217 except InvalidParameterError as e:
    218     # When the function is just a wrapper around an estimator, we allow
    219     # the function to delegate validation to the estimator, but we replace
    220     # the name of the estimator by the name of the function in the error
    221     # message to avoid confusion.
    222     msg = re.sub(
    223         r"parameter of \w+ must be",
    224         f"parameter of {func.__qualname__} must be",
    225         str(e),
    226     )

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:684, in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, params, pre_dispatch, error_score)
    681 # To ensure multimetric format is not supported
    682 scorer = check_scoring(estimator, scoring=scoring)
--> 684 cv_results = cross_validate(
    685     estimator=estimator,
    686     X=X,
    687     y=y,
    688     groups=groups,
    689     scoring={"score": scorer},
    690     cv=cv,
    691     n_jobs=n_jobs,
    692     verbose=verbose,
    693     params=params,
    694     pre_dispatch=pre_dispatch,
    695     error_score=error_score,
    696 )
    697 return cv_results["test_score"]

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/utils/_param_validation.py:216, in validate_params.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    210 try:
    211     with config_context(
    212         skip_parameter_validation=(
    213             prefer_skip_nested_validation or global_skip_validation
    214         )
    215     ):
--> 216         return func(*args, **kwargs)
    217 except InvalidParameterError as e:
    218     # When the function is just a wrapper around an estimator, we allow
    219     # the function to delegate validation to the estimator, but we replace
    220     # the name of the estimator by the name of the function in the error
    221     # message to avoid confusion.
    222     msg = re.sub(
    223         r"parameter of \w+ must be",
    224         f"parameter of {func.__qualname__} must be",
    225         str(e),
    226     )

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:411, in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, params, pre_dispatch, return_train_score, return_estimator, return_indices, error_score)
    408 # We clone the estimator to make sure that all the folds are
    409 # independent, and that it is pickle-able.
    410 parallel = Parallel(n_jobs=n_jobs, verbose=verbose, pre_dispatch=pre_dispatch)
--> 411 results = parallel(
    412     delayed(_fit_and_score)(
    413         clone(estimator),
    414         X,
    415         y,
    416         scorer=scorers,
    417         train=train,
    418         test=test,
    419         verbose=verbose,
    420         parameters=None,
    421         fit_params=routed_params.estimator.fit,
    422         score_params=routed_params.scorer.score,
    423         return_train_score=return_train_score,
    424         return_times=True,
    425         return_estimator=return_estimator,
    426         error_score=error_score,
    427     )
    428     for train, test in indices
    429 )
    431 _warn_or_raise_about_fit_failures(results, error_score)
    433 # For callable scoring, the return type is only know after calling. If the
    434 # return type is a dictionary, the error scores can now be inserted with
    435 # the correct key.

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/utils/parallel.py:77, in Parallel.__call__(self, iterable)
     72 config = get_config()
     73 iterable_with_config = (
     74     (_with_config(delayed_func, config), args, kwargs)
     75     for delayed_func, args, kwargs in iterable
     76 )
---> 77 return super().__call__(iterable_with_config)

File ~/Desktop/env/lib/python3.12/site-packages/joblib/parallel.py:1918, in Parallel.__call__(self, iterable)
   1916     output = self._get_sequential_output(iterable)
   1917     next(output)
-> 1918     return output if self.return_generator else list(output)
   1920 # Let's create an ID that uniquely identifies the current call. If the
   1921 # call is interrupted early and that the same instance is immediately
   1922 # re-used, this id will be used to prevent workers that were
   1923 # concurrently finalizing a task from the previous call to run the
   1924 # callback.
   1925 with self._lock:

File ~/Desktop/env/lib/python3.12/site-packages/joblib/parallel.py:1847, in Parallel._get_sequential_output(self, iterable)
   1845 self.n_dispatched_batches += 1
   1846 self.n_dispatched_tasks += 1
-> 1847 res = func(*args, **kwargs)
   1848 self.n_completed_tasks += 1
   1849 self.print_progress()

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/utils/parallel.py:139, in _FuncWrapper.__call__(self, *args, **kwargs)
    137     config = {}
    138 with config_context(**config):
--> 139     return self.function(*args, **kwargs)

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/model_selection/_validation.py:866, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, score_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    864         estimator.fit(X_train, **fit_params)
    865     else:
--> 866         estimator.fit(X_train, y_train, **fit_params)
    868 except Exception:
    869     # Note fit time as time until error
    870     fit_time = time.time() - start_time

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/base.py:1389, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1382     estimator._validate_params()
   1384 with config_context(
   1385     skip_parameter_validation=(
   1386         prefer_skip_nested_validation or global_skip_validation
   1387     )
   1388 ):
-> 1389     return fit_method(estimator, *args, **kwargs)

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/pipeline.py:660, in Pipeline.fit(self, X, y, **params)
    654     if self._final_estimator != "passthrough":
    655         last_step_params = self._get_metadata_for_step(
    656             step_idx=len(self) - 1,
    657             step_params=routed_params[self.steps[-1][0]],
    658             all_params=params,
    659         )
--> 660         self._final_estimator.fit(Xt, y, **last_step_params["fit"])
    662 return self

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/base.py:1389, in _fit_context.<locals>.decorator.<locals>.wrapper(estimator, *args, **kwargs)
   1382     estimator._validate_params()
   1384 with config_context(
   1385     skip_parameter_validation=(
   1386         prefer_skip_nested_validation or global_skip_validation
   1387     )
   1388 ):
-> 1389     return fit_method(estimator, *args, **kwargs)

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:1350, in LogisticRegression.fit(self, X, y, sample_weight)
   1347 else:
   1348     n_threads = 1
-> 1350 fold_coefs_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, prefer=prefer)(
   1351     path_func(
   1352         X,
   1353         y,
   1354         pos_class=class_,
   1355         Cs=[C_],
   1356         l1_ratio=self.l1_ratio,
   1357         fit_intercept=self.fit_intercept,
   1358         tol=self.tol,
   1359         verbose=self.verbose,
   1360         solver=solver,
   1361         multi_class=multi_class,
   1362         max_iter=self.max_iter,
   1363         class_weight=self.class_weight,
   1364         check_input=False,
   1365         random_state=self.random_state,
   1366         coef=warm_start_coef_,
   1367         penalty=penalty,
   1368         max_squared_sum=max_squared_sum,
   1369         sample_weight=sample_weight,
   1370         n_threads=n_threads,
   1371     )
   1372     for class_, warm_start_coef_ in zip(classes_, warm_start_coef)
   1373 )
   1375 fold_coefs_, _, n_iter_ = zip(*fold_coefs_)
   1376 self.n_iter_ = np.asarray(n_iter_, dtype=np.int32)[:, 0]

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/utils/parallel.py:77, in Parallel.__call__(self, iterable)
     72 config = get_config()
     73 iterable_with_config = (
     74     (_with_config(delayed_func, config), args, kwargs)
     75     for delayed_func, args, kwargs in iterable
     76 )
---> 77 return super().__call__(iterable_with_config)

File ~/Desktop/env/lib/python3.12/site-packages/joblib/parallel.py:1918, in Parallel.__call__(self, iterable)
   1916     output = self._get_sequential_output(iterable)
   1917     next(output)
-> 1918     return output if self.return_generator else list(output)
   1920 # Let's create an ID that uniquely identifies the current call. If the
   1921 # call is interrupted early and that the same instance is immediately
   1922 # re-used, this id will be used to prevent workers that were
   1923 # concurrently finalizing a task from the previous call to run the
   1924 # callback.
   1925 with self._lock:

File ~/Desktop/env/lib/python3.12/site-packages/joblib/parallel.py:1847, in Parallel._get_sequential_output(self, iterable)
   1845 self.n_dispatched_batches += 1
   1846 self.n_dispatched_tasks += 1
-> 1847 res = func(*args, **kwargs)
   1848 self.n_completed_tasks += 1
   1849 self.print_progress()

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/utils/parallel.py:139, in _FuncWrapper.__call__(self, *args, **kwargs)
    137     config = {}
    138 with config_context(**config):
--> 139     return self.function(*args, **kwargs)

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/linear_model/_logistic.py:451, in _logistic_regression_path(X, y, pos_class, Cs, fit_intercept, max_iter, tol, verbose, solver, coef, class_weight, dual, penalty, intercept_scaling, multi_class, random_state, check_input, max_squared_sum, sample_weight, l1_ratio, n_threads)
    447 l2_reg_strength = 1.0 / (C * sw_sum)
    448 iprint = [-1, 50, 1, 100, 101][
    449     np.searchsorted(np.array([0, 1, 2, 3]), verbose)
    450 ]
--> 451 opt_res = optimize.minimize(
    452     func,
    453     w0,
    454     method="L-BFGS-B",
    455     jac=True,
    456     args=(X, target, sample_weight, l2_reg_strength, n_threads),
    457     options={
    458         "maxiter": max_iter,
    459         "maxls": 50,  # default is 20
    460         "iprint": iprint,
    461         "gtol": tol,
    462         "ftol": 64 * np.finfo(float).eps,
    463     },
    464 )
    465 n_iter_i = _check_optimize_result(
    466     solver,
    467     opt_res,
    468     max_iter,
    469     extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG,
    470 )
    471 w0, loss = opt_res.x, opt_res.fun

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_minimize.py:713, in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    710     res = _minimize_newtoncg(fun, x0, args, jac, hess, hessp, callback,
    711                              **options)
    712 elif meth == 'l-bfgs-b':
--> 713     res = _minimize_lbfgsb(fun, x0, args, jac, bounds,
    714                            callback=callback, **options)
    715 elif meth == 'tnc':
    716     res = _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
    717                         **options)

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_lbfgsb_py.py:407, in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, finite_diff_rel_step, **unknown_options)
    401 task_str = task.tobytes()
    402 if task_str.startswith(b'FG'):
    403     # The minimization routine wants f and g at the current x.
    404     # Note that interruptions due to maxfun are postponed
    405     # until the completion of the current minimization iteration.
    406     # Overwrite f and g:
--> 407     f, g = func_and_grad(x)
    408 elif task_str.startswith(b'NEW_X'):
    409     # new iteration
    410     n_iterations += 1

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_differentiable_functions.py:296, in ScalarFunction.fun_and_grad(self, x)
    294 if not np.array_equal(x, self.x):
    295     self._update_x_impl(x)
--> 296 self._update_fun()
    297 self._update_grad()
    298 return self.f, self.g

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_differentiable_functions.py:262, in ScalarFunction._update_fun(self)
    260 def _update_fun(self):
    261     if not self.f_updated:
--> 262         self._update_fun_impl()
    263         self.f_updated = True

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_differentiable_functions.py:163, in ScalarFunction.__init__.<locals>.update_fun()
    162 def update_fun():
--> 163     self.f = fun_wrapped(self.x)

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_differentiable_functions.py:145, in ScalarFunction.__init__.<locals>.fun_wrapped(x)
    141 self.nfev += 1
    142 # Send a copy because the user may overwrite it.
    143 # Overwriting results in undefined behaviour because
    144 # fun(self.x) will change self.x, with the two no longer linked.
--> 145 fx = fun(np.copy(x), *args)
    146 # Make sure the function returns a true scalar
    147 if not np.isscalar(fx):

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_optimize.py:79, in MemoizeJac.__call__(self, x, *args)
     77 def __call__(self, x, *args):
     78     """ returns the function value """
---> 79     self._compute_if_needed(x, *args)
     80     return self._value

File ~/Desktop/env/lib/python3.12/site-packages/scipy/optimize/_optimize.py:73, in MemoizeJac._compute_if_needed(self, x, *args)
     71 if not np.all(x == self.x) or self._value is None or self.jac is None:
     72     self.x = np.asarray(x).copy()
---> 73     fg = self.fun(x, *args)
     74     self.jac = fg[1]
     75     self._value = fg[0]

File ~/Desktop/env/lib/python3.12/site-packages/sklearn/linear_model/_linear_loss.py:332, in LinearModelLoss.loss_gradient(self, coef, X, y, sample_weight, l2_reg_strength, n_threads, raw_prediction)
    330     grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
    331     if self.fit_intercept:
--> 332         grad[-1] = grad_pointwise.sum()
    333 else:
    334     grad = np.empty((n_classes, n_dof), dtype=weights.dtype, order="F")

File ~/Desktop/env/lib/python3.12/site-packages/numpy/_core/_methods.py:50, in _sum(a, axis, dtype, out, keepdims, initial, where)
     46 def _amin(a, axis=None, out=None, keepdims=False,
     47           initial=_NoValue, where=True):
     48     return umr_minimum(a, axis, None, out, keepdims, initial, where)
---> 50 def _sum(a, axis=None, dtype=None, out=None, keepdims=False,
     51          initial=_NoValue, where=True):
     52     return umr_sum(a, axis, dtype, out, keepdims, initial, where)
     54 def _prod(a, axis=None, dtype=None, out=None, keepdims=False,
     55           initial=_NoValue, where=True):

KeyboardInterrupt: 

Ridge Classifier and Random Forest#

from sklearn.linear_model import RidgeClassifier

# MLflow configuration
# SECURITY NOTE(review): the hard-coded credentials/token below should be
# rotated and loaded from environment variables or a secrets manager rather
# than being committed to source.
MLFLOW_TRACKING_URI = "https://dagshub.com/charankonduru2003/fraud_detection.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'charankonduru2003'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '0b22e21e2cfc7080ebc0fc65157efbff55fc501e'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("fraud_detection_experiment")

def prepare_data(df):
    """Return (X, y, categorical_cols, numerical_cols) for the fraud dataset.

    Drops the label 'isFraud' and the rule-based column 'isFlaggedFraud'
    from the feature matrix.
    """
    y = df['isFraud']
    X = df.drop(columns=['isFlaggedFraud', 'isFraud'])

    cat_cols = ['type', 'nameOrig', 'nameDest']
    num_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig',
                'oldbalanceDest', 'newbalanceDest', 'step']

    return X, y, cat_cols, num_cols

def create_preprocessing(numerical_cols, categorical_cols):
    """Assemble the shared preprocessing ColumnTransformer.

    Numeric features: log1p of the magnitude, standardize, then min-max
    rescale to [0, 1]. Categorical features: dense one-hot encoding that
    ignores categories unseen at fit time.
    """
    log_step = FunctionTransformer(lambda x: np.log1p(np.abs(x)))
    numeric = Pipeline([
        ('log_transform', log_step),
        ('scaler', StandardScaler()),
        ('minmax', MinMaxScaler()),
    ])
    categorical = Pipeline([
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    return ColumnTransformer(transformers=[
        ('num', numeric, numerical_cols),
        ('cat', categorical, categorical_cols),
    ])

def evaluate_model(model, X, y):
    """Evaluate *model* on (X, y); return F1 and confusion-matrix counts."""
    y_hat = model.predict(X)
    tn, fp, fn, tp = confusion_matrix(y, y_hat).ravel()
    metrics = {'f1_score': f1_score(y, y_hat)}
    metrics.update(true_positives=tp, true_negatives=tn,
                   false_positives=fp, false_negatives=fn)
    return metrics

def get_classifiers():
    """Return the candidate models to compare, keyed by a short display name."""
    classifiers = {}
    classifiers['LogisticRegression'] = LogisticRegression(max_iter=1000)
    classifiers['RidgeClassifier'] = RidgeClassifier()
    classifiers['RandomForest'] = RandomForestClassifier(n_estimators=100,
                                                         random_state=42)
    return classifiers

def train_and_evaluate_models(df, n_cv_folds=3):
    """Cross-validate and fit each candidate classifier, one MLflow run each.

    For every model from get_classifiers(): build a preprocessing+classifier
    pipeline, run KFold CV (scoring='f1'), fit on all of (X, y), log CV and
    training metrics plus the fitted pipeline to MLflow, and collect results.

    Returns {classifier_name: {'cv_scores', 'train_metrics', 'model'}}.
    """
    # Prepare data
    X, y, categorical_cols, numerical_cols = prepare_data(df)
    
    # Create preprocessor
    # NOTE: this single preprocessor instance is shared by all pipelines in
    # the loop below; cross_val_score clones it per fold, but each
    # pipeline.fit() refits it in place on the same full dataset.
    preprocessor = create_preprocessing(numerical_cols, categorical_cols)
    
    # Get classifiers
    classifiers = get_classifiers()
    results = {}
    
    # Train and evaluate each classifier
    for clf_name, classifier in classifiers.items():
        with mlflow.start_run(run_name=f"fraud_detection_{clf_name}"):
            # Create pipeline
            pipeline = Pipeline([
                ('preprocessor', preprocessor),
                ('classifier', classifier)
            ])
            
            # Log basic parameters
            mlflow.log_param("model_type", clf_name)
            mlflow.log_param("n_cv_folds", n_cv_folds)
            
            # Perform cross-validation
            cv = KFold(n_splits=n_cv_folds, shuffle=True, random_state=42)
            cv_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='f1')
            
            # Log cross-validation results
            mlflow.log_metric("cv_f1_mean", cv_scores.mean())
            mlflow.log_metric("cv_f1_std", cv_scores.std())
            
            # Fit on full training data
            pipeline.fit(X, y)
            
            # Evaluate on full training set
            # NOTE: computed on the data the pipeline was just fitted on, so
            # these metrics are optimistic relative to the CV scores.
            train_metrics = evaluate_model(pipeline, X, y)
            for metric_name, metric_value in train_metrics.items():
                mlflow.log_metric(f"train_{metric_name}", metric_value)
            
            # Log the model
            signature = infer_signature(X, y)
            mlflow.sklearn.log_model(pipeline, "model", signature=signature)
            
            # Store results
            results[clf_name] = {
                'cv_scores': cv_scores,
                'train_metrics': train_metrics,
                'model': pipeline
            }
    
    return results

# Run the multi-classifier pipeline
def print_results(results):
    """Pretty-print CV scores and training metrics for each classifier."""
    divider = "-" * 50
    print("\nResults Summary:")
    print(divider)
    
    for name, outcome in results.items():
        scores = outcome['cv_scores']
        print(f"\n{name}:")
        print(f"Cross-validation F1 Scores: {scores}")
        print(f"Mean CV F1: {scores.mean():.3f} (\u00b1{scores.std():.3f})")
        print("\nTraining Metrics:")
        for metric, value in outcome['train_metrics'].items():
            print(f"{metric}: {value}")
        print(divider)

# Execute the pipeline
# BUG FIX: this cell referenced `df`, which is never defined in this file —
# the dataset is loaded into `data`, and earlier cells already use `data`.
results = train_and_evaluate_models(data)
print_results(results)
🏃 View run fraud_detection_LogisticRegression at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/862dffde69d5495d869f87943a3cf455
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3
🏃 View run fraud_detection_RidgeClassifier at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/4de295ec2ebd40ae998ec16a7780d477
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3
🏃 View run fraud_detection_RandomForest at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/f06b42821b0c4694b0b84d97256c9773
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3

Results Summary:
--------------------------------------------------

LogisticRegression:
Cross-validation F1 Scores: [0.94095172 0.94128307 0.94607143]
Mean CV F1: 0.943 (±0.002)

Training Metrics:
f1_score: 0.985262902541088
true_positives: 8123
true_negatives: 16149
false_positives: 153
false_negatives: 90
--------------------------------------------------

RidgeClassifier:
Cross-validation F1 Scores: [0.89221655 0.88968457 0.8915703 ]
Mean CV F1: 0.891 (±0.001)

Training Metrics:
f1_score: 1.0
true_positives: 8213
true_negatives: 16302
false_positives: 0
false_negatives: 0
--------------------------------------------------

RandomForest:
Cross-validation F1 Scores: [0.96638504 0.96798493 0.96688988]
Mean CV F1: 0.967 (±0.001)

Training Metrics:
f1_score: 1.0
true_positives: 8213
true_negatives: 16302
false_positives: 0
false_negatives: 0
--------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
import mlflow
from mlflow.models import infer_signature
import os
import warnings
warnings.filterwarnings('ignore')

# MLflow configuration
# SECURITY NOTE(review): rotate this token and load the credentials from the
# environment or a secrets manager rather than hard-coding them here.
MLFLOW_TRACKING_URI = "https://dagshub.com/charankonduru2003/fraud_detection.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'charankonduru2003'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '0b22e21e2cfc7080ebc0fc65157efbff55fc501e'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
# NOTE: a different experiment name than the earlier cells ("fraud_detection"
# vs "fraud_detection_experiment"), so these runs land in a separate experiment.
mlflow.set_experiment("fraud_detection")

# Print column names and their indices
# Reference map of the positional indices that create_balance_diff /
# create_amount_features below hard-code; it does not appear to be referenced
# anywhere later in this chunk.
column_indices = {
    'step': 0,
    'amount': 1,
    'oldbalanceOrg': 2,
    'newbalanceOrig': 3,
    'oldbalanceDest': 4,
    'newbalanceDest': 5
}

def create_balance_diff(X):
    """Derive origin/destination balance deltas and ratios.

    Expects columns ordered as in `column_indices`:
    [step, amount, oldbalanceOrg, newbalanceOrig, oldbalanceDest,
    newbalanceDest].

    Returns an (n, 4) array: [newbalanceOrig - oldbalanceOrg,
    newbalanceDest - oldbalanceDest, newbalanceOrig / oldbalanceOrg,
    newbalanceDest / oldbalanceDest], with each ratio defined as 0 where
    the corresponding old balance is 0.
    """
    # Work on a float copy so the `out=` buffers below have float dtype
    X = np.asarray(X, dtype=float)

    old_org, new_org = X[:, 2], X[:, 3]    # oldbalanceOrg, newbalanceOrig
    old_dest, new_dest = X[:, 4], X[:, 5]  # oldbalanceDest, newbalanceDest

    # Calculate differences
    balance_diff_orig = new_org - old_org
    balance_diff_dest = new_dest - old_dest

    # BUG FIX: np.where(cond, a/b, 0) still evaluates a/b for every row, so
    # zero old balances raised divide-by-zero RuntimeWarnings and produced
    # inf/nan intermediates. np.divide with `where=` only divides where the
    # denominator is non-zero; masked rows keep the 0 from `out=`.
    balance_ratio_orig = np.divide(new_org, old_org,
                                   out=np.zeros_like(new_org),
                                   where=old_org != 0)
    balance_ratio_dest = np.divide(new_dest, old_dest,
                                   out=np.zeros_like(new_dest),
                                   where=old_dest != 0)

    return np.column_stack((balance_diff_orig, balance_diff_dest,
                           balance_ratio_orig, balance_ratio_dest))

def create_amount_features(X):
    """Derive amount-based ratio features.

    Expects the same column order as create_balance_diff. Returns an (n, 3)
    array: [amount / oldbalanceOrg, amount / newbalanceOrig,
    amount / (step + 1)], with the balance ratios defined as 0 where the
    denominator balance is 0.
    """
    # Work on a float copy so the `out=` buffers below have float dtype
    X = np.asarray(X, dtype=float)

    step, amount = X[:, 0], X[:, 1]
    old_org, new_org = X[:, 2], X[:, 3]

    # BUG FIX: as in create_balance_diff, use np.divide(..., where=...) so
    # the division is skipped (and no RuntimeWarning raised) where the
    # denominator is zero; those rows keep the 0 supplied via `out=`.
    amount_to_oldbalance_ratio = np.divide(amount, old_org,
                                           out=np.zeros_like(amount),
                                           where=old_org != 0)
    amount_to_newbalance_ratio = np.divide(amount, new_org,
                                           out=np.zeros_like(amount),
                                           where=new_org != 0)

    # Step-amount ratio; assumes step >= 0 (an hour counter in this dataset)
    # so the denominator step + 1 is never zero — TODO confirm at the caller.
    step_amount_ratio = amount / (step + 1)

    return np.column_stack((amount_to_oldbalance_ratio,
                           amount_to_newbalance_ratio,
                           step_amount_ratio))

# Start MLflow run
# NOTE(review): this cell reads `df`, but the dataset earlier in the file is
# loaded into `data` — `df` must have been defined elsewhere in the session.
with mlflow.start_run(run_name="feature_engineering_experiment"):
    
    # Define features and target using existing df
    X = df.drop(['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'], axis=1)
    y = df['isFraud']
    
    # Create the preprocessing pipeline with correct column names
    numeric_features = ['step', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 
                       'oldbalanceDest', 'newbalanceDest']
    categorical_features = ['type']
    
    # Create custom transformers
    balance_transformer = FunctionTransformer(create_balance_diff, validate=False)
    amount_transformer = FunctionTransformer(create_amount_features, validate=False)
    
    # Create preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
        ])
    
    # Create the full pipeline
    # NOTE(review): the feature transformers run AFTER the preprocessor, so
    # their hard-coded column indices address the scaled/one-hot output, not
    # the raw columns their comments name. Each FunctionTransformer also
    # replaces its input with only its derived columns (4 from
    # create_balance_diff, then 3 from create_amount_features), so the
    # classifier ultimately sees just those 3 values — likely why the F1
    # printed below is so low. Confirm intent before relying on this pipeline.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('balance_features', balance_transformer),
        ('amount_features', amount_transformer),
        ('classifier', LogisticRegression(class_weight='balanced', random_state=42))
    ])
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Print the first few rows of X_train to verify the structure
    print("X_train first few rows:")
    print(X_train.head())
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions
    y_pred = pipeline.predict(X_test)
    
    # Calculate metrics
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Log parameters
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("scaler", "StandardScaler")
    mlflow.log_param("balance_features", "True")
    mlflow.log_param("amount_features", "True")
    mlflow.log_param("class_weight", "balanced")
    
    # Log metrics
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("true_negatives", conf_matrix[0][0])
    mlflow.log_metric("false_positives", conf_matrix[0][1])
    mlflow.log_metric("false_negatives", conf_matrix[1][0])
    mlflow.log_metric("true_positives", conf_matrix[1][1])
    
    # Log the model
    signature = infer_signature(X_train, y_train)
    mlflow.sklearn.log_model(pipeline, "model", signature=signature)
    
    print(f"F1 Score: {f1}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nModel and metrics have been logged to MLflow")
X_train first few rows:
       step      type      amount  oldbalanceOrg  newbalanceOrig  \
4881    136   CASH_IN   109848.79     4192711.26      4302560.05   
12288   268  CASH_OUT  2219475.13     2219475.13            0.00   
17105   356  CASH_OUT    59939.88         260.00            0.00   
4067    116  TRANSFER  2066848.27     2066848.27            0.00   
2295     37   CASH_IN      610.91     1444486.47      1445097.38   

       oldbalanceDest  newbalanceDest  
4881        980626.88       870778.09  
12288       589145.32      2808620.45  
17105            0.00        59939.88  
4067             0.00            0.00  
2295      11258719.38     11517636.74  
F1 Score: 0.19842053307008883

Confusion Matrix:
[[3078  182]
 [1442  201]]

Model and metrics have been logged to MLflow
🏃 View run feature_engineering_experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/1/runs/5189318cf3ec4c7bbc85672e19f98ea3
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/1

Feature Selection#

import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import mlflow
import warnings
warnings.filterwarnings('ignore')

def prepare_data(df):
    """Split *df* into a feature matrix X and a target vector y.

    Identifier columns, the step counter, the flagged-fraud indicator and
    the target itself are removed from X; the categorical ``type`` column
    is expanded into one-hot dummy columns.
    """
    # Columns that identify rows or leak the target — excluded from X.
    drop_cols = ['step', 'nameOrig', 'nameDest', 'isFlaggedFraud', 'isFraud']

    remaining = df.drop(drop_cols, axis=1)
    X = pd.get_dummies(remaining, columns=['type'])
    y = df['isFraud']

    return X, y

def correlation_threshold_selection(X, threshold=0.7):
    """Drop one member of every highly correlated feature pair.

    A column is discarded when its absolute correlation with any earlier
    column exceeds *threshold*.  Returns the surviving column names and a
    list of the offending pairs (feature names + correlation) for logging.
    """
    abs_corr = X.corr().abs()

    # Mask the diagonal and lower triangle so each pair is examined once.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(upper_mask)

    dropped = [col for col in upper.columns if (upper[col] > threshold).any()]
    kept = [col for col in X.columns if col not in dropped]

    # Record every above-threshold pair so the decision is auditable.
    cols = list(X.columns)
    pairs = []
    for i, left in enumerate(cols):
        for j in range(i + 1, len(cols)):
            strength = abs_corr.iloc[i, j]
            if strength > threshold:
                pairs.append({
                    'feature1': left,
                    'feature2': cols[j],
                    'correlation': strength
                })

    return kept, pairs

def variance_threshold_selection(X, threshold=0.01):
    """Select features whose population variance exceeds *threshold*.

    Bug fix: the previous implementation ran ``StandardScaler`` before
    ``VarianceThreshold``, which forces every non-constant column to unit
    variance — the threshold then never filtered anything and the logged
    "feature variances" were all ~1.0.  The variance is now computed on the
    raw feature values, matching
    ``sklearn.feature_selection.VarianceThreshold`` semantics (population
    variance, ddof=0, strict ``> threshold`` comparison).

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix.
    threshold : float, default 0.01
        Minimum variance a feature must exceed to be kept.

    Returns
    -------
    selected_features : list of str
        Columns with variance strictly above *threshold*.
    feature_variances : dict of str -> float
        Population variance of every column (for logging).
    """
    # Population variance per column (ddof=0), as VarianceThreshold uses.
    variances = np.var(X.to_numpy(dtype=float), axis=0)

    feature_variances = dict(zip(X.columns, variances))

    # Strict comparison mirrors sklearn's support mask (variances_ > threshold).
    selected_features = [col for col, var in feature_variances.items()
                         if var > threshold]

    return selected_features, feature_variances

def feature_importance_selection(X, y, threshold=0.01):
    """Keep features whose Random-Forest importance exceeds *threshold*.

    Fits a 100-tree forest (fixed seed for reproducibility) and returns the
    columns whose impurity-based importance is strictly above *threshold*,
    along with the full column -> importance mapping.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)

    # Map every column to its impurity-based importance score.
    importances = {col: score
                   for col, score in zip(X.columns, forest.feature_importances_)}

    selected = [col for col, score in importances.items() if score > threshold]

    return selected, importances

def run_feature_selection():
    """Compare three feature-selection strategies and log results to MLflow.

    Runs correlation-, variance- and importance-based selection on the
    globally loaded ``df``, logs each method's parameters and selections as
    MLflow params / JSON artifacts, and returns a summary dictionary.
    """
    with mlflow.start_run(run_name="feature_selection_comparison"):
        X, y = prepare_data(df)
        mlflow.log_param("original_features_count", X.shape[1])

        # --- Method 1: drop one member of each highly correlated pair ---
        by_corr, corr_pairs = correlation_threshold_selection(X, threshold=0.7)
        mlflow.log_param("correlation_threshold", 0.7)
        mlflow.log_param("correlation_selected_count", len(by_corr))
        mlflow.log_dict({"correlation_pairs": corr_pairs}, "correlation_pairs.json")
        mlflow.log_dict({"correlation_selected_features": by_corr},
                        "correlation_selected.json")

        # --- Method 2: drop near-constant features ---
        by_var, variances = variance_threshold_selection(X, threshold=0.01)
        mlflow.log_param("variance_threshold", 0.01)
        mlflow.log_param("variance_selected_count", len(by_var))
        mlflow.log_dict({"feature_variances": variances},
                        "feature_variances.json")
        mlflow.log_dict({"variance_selected_features": by_var},
                        "variance_selected.json")

        # --- Method 3: keep features a random forest finds important ---
        by_imp, importances = feature_importance_selection(X, y, threshold=0.01)
        mlflow.log_param("importance_threshold", 0.01)
        mlflow.log_param("importance_selected_count", len(by_imp))
        mlflow.log_dict({"feature_importances": importances},
                        "feature_importances.json")
        mlflow.log_dict({"importance_selected_features": by_imp},
                        "importance_selected.json")

        # Features that every method agreed to keep.
        agreed = list(set(by_corr) & set(by_var) & set(by_imp))
        mlflow.log_param("common_features_count", len(agreed))
        mlflow.log_dict({"common_features": agreed}, "common_features.json")

        # Per-feature record of which methods selected it, plus its scores.
        summary = {
            feature: {
                'selected_by_correlation': feature in by_corr,
                'selected_by_variance': feature in by_var,
                'selected_by_importance': feature in by_imp,
                'selected_by_all_methods': feature in agreed,
                'variance': variances[feature],
                'importance': importances[feature]
            }
            for feature in X.columns
        }
        mlflow.log_dict(summary, "feature_summary.json")

        return {
            'correlation_selected': by_corr,
            'variance_selected': by_var,
            'importance_selected': by_imp,
            'common_features': agreed,
            'feature_summary': summary
        }

# Execute the comparison and report how many features each method kept.
results = run_feature_selection()

print("\nFeature Selection Summary:")
print(f"Original features: {len(results['feature_summary'].keys())}")
for label, key in (("correlation", 'correlation_selected'),
                   ("variance", 'variance_selected'),
                   ("importance", 'importance_selected'),
                   ("all methods", 'common_features')):
    print(f"Features selected by {label}: {len(results[key])}")
🏃 View run feature_selection_comparison at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/1dab8b45a50e43e196929f046a1a923e
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3

Feature Selection Summary:
Original features: 9
Features selected by correlation: 7
Features selected by variance: 9
Features selected by importance: 9
Features selected by all methods: 7
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, precision_score, recall_score
import mlflow
import matplotlib.pyplot as plt
import seaborn as sns
import json

def prepare_data(df):
    """Return (X, y): one-hot encoded features and the ``isFraud`` label."""
    # Identifiers and leakage-prone columns are excluded from the features.
    excluded = ['step', 'nameOrig', 'nameDest', 'isFlaggedFraud', 'isFraud']

    y = df['isFraud']
    X = pd.get_dummies(df.drop(excluded, axis=1), columns=['type'])

    return X, y

def perform_pca_analysis(X, variance_threshold=0.95):
    """Fit a full PCA on standardised features.

    Returns the fitted PCA object, the projected data, and the smallest
    number of leading components whose cumulative explained-variance ratio
    reaches *variance_threshold*.
    """
    standardized = StandardScaler().fit_transform(X)

    pca = PCA()
    projected = pca.fit_transform(standardized)

    cumulative = np.cumsum(pca.explained_variance_ratio_)
    # argmax yields the first index at which the threshold is reached.
    n_components = np.argmax(cumulative >= variance_threshold) + 1

    return pca, projected, n_components

def evaluate_model(X, y):
    """Train a seeded Random Forest on an 80/20 split and score the holdout.

    Returns f1, precision and recall for the positive (fraud) class.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    # Fraud detection is imbalanced, so report class-1 oriented metrics.
    return {name: scorer(y_test, predictions)
            for name, scorer in (('f1', f1_score),
                                 ('precision', precision_score),
                                 ('recall', recall_score))}

def run_pca_experiment(df):
    """Run a PCA dimensionality-reduction experiment and log it to MLflow.

    Standardises the features, fits a full PCA, picks the number of
    components needed to reach 95% explained variance, then trains and
    compares a Random Forest on the original vs. the PCA-reduced features.
    Explained-variance ratios, metrics and component loadings are logged
    as MLflow params/metrics/artifacts.

    Returns a dict with the chosen component count, the explained-variance
    ratios, and both sets of evaluation metrics.
    """
    with mlflow.start_run(run_name="pca_dimensionality_reduction"):
        # Prepare data
        X, y = prepare_data(df)
        mlflow.log_param("original_features", X.shape[1])
        
        # Full PCA (all components) plus the 95%-variance component count.
        pca, pca_result, n_components = perform_pca_analysis(X)
        
        # Log per-component and cumulative explained variance ratios.
        explained_variance_data = {
            'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
            'cumulative_variance_ratio': np.cumsum(pca.explained_variance_ratio_).tolist()
        }
        mlflow.log_dict(explained_variance_data, "explained_variance.json")
        
        # Log number of selected components
        mlflow.log_param("selected_components", n_components)
        mlflow.log_metric("variance_explained", 
                         np.sum(pca.explained_variance_ratio_[:n_components]))
        
        # Refit PCA with only the selected components to build the reduced set.
        pca_reduced = PCA(n_components=n_components)
        X_reduced = pca_reduced.fit_transform(StandardScaler().fit_transform(X))
        
        # Train identical models on the original and reduced feature sets.
        original_metrics = evaluate_model(X, y)
        reduced_metrics = evaluate_model(X_reduced, y)
        
        # Log metrics for both runs under distinguishing prefixes.
        for metric, value in original_metrics.items():
            mlflow.log_metric(f"original_{metric}", value)
        
        for metric, value in reduced_metrics.items():
            mlflow.log_metric(f"pca_reduced_{metric}", value)
        
        # Loadings: how strongly each original feature contributes to each PC.
        component_loadings = pd.DataFrame(
            pca.components_[:n_components].T,
            columns=[f'PC{i+1}' for i in range(n_components)],
            index=X.columns
        )
        mlflow.log_dict(component_loadings.to_dict(), "component_loadings.json")
        
        return {
            'n_components': n_components,
            'explained_variance_ratio': pca.explained_variance_ratio_.tolist(),
            'metrics': {
                'original': original_metrics,
                'reduced': reduced_metrics
            }
        }

# Run the experiment (requires the raw transactions DataFrame `df` to be
# loaded and an MLflow tracking server to be configured).
results = run_pca_experiment(df)
🏃 View run pca_dimensionality_reduction at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/69547b88b0a246df88a26f14f1a5ed22
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3

Feature Engineering#

import pandas as pd
import numpy as np
import mlflow

def create_transaction_features(df):
    """Derive transaction-level amount features from *df*."""
    out = pd.DataFrame()

    # Amount normalised by elapsed steps (+1 guards against step == 0).
    out['amount_per_step'] = df['amount'] / (df['step'] + 1)

    # Transaction size relative to the mean amount within its own step.
    step_mean = df.groupby('step')['amount'].transform('mean')
    out['transaction_velocity'] = df['amount'] / step_mean

    return out

def create_balance_features(df):
    """Derive balance-based features: deltas, ratios and amount-relative changes."""
    out = pd.DataFrame()

    # Raw balance deltas on each side of the transaction.
    out['orig_balance_diff'] = df['newbalanceOrig'] - df['oldbalanceOrg']
    out['dest_balance_diff'] = df['newbalanceDest'] - df['oldbalanceDest']

    # Guarded division: yields 0 where the old balance is zero.
    def _safe_ratio(numerator, denominator):
        return np.where(denominator != 0, numerator / denominator, 0)

    # New-to-old balance ratios.
    out['orig_balance_ratio'] = _safe_ratio(df['newbalanceOrig'], df['oldbalanceOrg'])
    out['dest_balance_ratio'] = _safe_ratio(df['newbalanceDest'], df['oldbalanceDest'])

    # Transaction amount relative to the pre-transaction balances.
    out['amount_to_orig_old_balance'] = _safe_ratio(df['amount'], df['oldbalanceOrg'])
    out['amount_to_dest_old_balance'] = _safe_ratio(df['amount'], df['oldbalanceDest'])

    # Balance change per unit of amount; epsilon avoids division by zero.
    out['orig_balance_change_ratio'] = out['orig_balance_diff'] / (df['amount'] + 1e-8)
    out['dest_balance_change_ratio'] = out['dest_balance_diff'] / (df['amount'] + 1e-8)

    return out

def run_feature_engineering():
    """Create engineered transaction/balance features and log them to MLflow.

    Uses the globally loaded ``df`` (raw transactions DataFrame).  Logs
    human-readable feature descriptions, summary statistics, feature counts
    and each feature's Pearson correlation with ``isFraud``, then returns
    the engineered feature DataFrame.
    """
    with mlflow.start_run(run_name="feature_engineering_experiment"):
        # Create engineered features
        transaction_features = create_transaction_features(df)
        balance_features = create_balance_features(df)
        
        # Combine all features
        engineered_features = pd.concat([
            transaction_features,
            balance_features
        ], axis=1)
        
        # Human-readable descriptions, logged as a JSON artifact.
        feature_descriptions = {
            'Transaction Features': {
                'amount_per_step': 'Amount normalized by step',
                'transaction_velocity': 'Transaction amount relative to step average'
            },
            'Balance Features': {
                'orig_balance_diff': 'Change in origin account balance',
                'dest_balance_diff': 'Change in destination account balance',
                'orig_balance_ratio': 'Ratio of new to old origin balance',
                'dest_balance_ratio': 'Ratio of new to old destination balance',
                'amount_to_orig_old_balance': 'Transaction amount relative to origin balance',
                'amount_to_dest_old_balance': 'Transaction amount relative to destination balance',
                'orig_balance_change_ratio': 'Balance change relative to transaction amount (origin)',
                'dest_balance_change_ratio': 'Balance change relative to transaction amount (destination)'
            }
        }
        
        # Log feature information
        mlflow.log_dict(feature_descriptions, "feature_descriptions.json")
        
        # Log summary statistics (count/mean/std/quantiles per feature).
        feature_stats = engineered_features.describe().to_dict()
        mlflow.log_dict(feature_stats, "feature_statistics.json")
        
        # Log number of features created
        mlflow.log_param("total_engineered_features", len(engineered_features.columns))
        mlflow.log_param("transaction_features", len(transaction_features.columns))
        mlflow.log_param("balance_features", len(balance_features.columns))
        
        # Pearson correlation of every engineered feature with the target.
        target_correlations = {}
        for col in engineered_features.columns:
            correlation = engineered_features[col].corr(df['isFraud'])
            target_correlations[col] = correlation
            mlflow.log_metric(f"correlation_{col}", correlation)
        
        mlflow.log_dict(target_correlations, "target_correlations.json")
        
        return engineered_features

# Run the feature engineering process (logs artifacts to MLflow and
# returns the engineered DataFrame; requires `df` to be loaded).
engineered_features = run_feature_engineering()

# Display the first few rows of the engineered features
print("\nEngineered Features Preview:")
print(engineered_features.head())

# Display the shape of the engineered features
print("\nEngineered Features Shape:", engineered_features.shape)
🏃 View run feature_engineering_experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/f7ced10ec2b24d76ba6161b2e8c2bbe9
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3

Engineered Features Preview:
   amount_per_step  transaction_velocity  orig_balance_diff  \
0        66421.320              0.719277           -4499.08   
1           90.500              0.000980            -181.00   
2       208000.665              2.252441               0.00   
3          957.715              0.010371           -1915.43   
4           90.500              0.000980            -181.00   

   dest_balance_diff  orig_balance_ratio  dest_balance_ratio  \
0          132842.64            0.000000                0.00   
1               0.00            0.000000                0.00   
2         9291517.62            0.000000            91094.31   
3               0.00            0.832714                0.00   
4          -21182.00            0.000000                0.00   

   amount_to_orig_old_balance  amount_to_dest_old_balance  \
0                   29.526623                    0.000000   
1                    1.000000                    0.000000   
2                    0.000000                 4078.444412   
3                    0.167286                    0.000000   
4                    1.000000                    0.008545   

   orig_balance_change_ratio  dest_balance_change_ratio  
0                  -0.033868                   1.000000  
1                  -1.000000                   0.000000  
2                   0.000000                  22.335307  
3                  -1.000000                   0.000000  
4                  -1.000000                -117.027624  

Engineered Features Shape: (24515, 10)

Custom Experiment#

from sklearn.preprocessing import RobustScaler
def calculate_transaction_patterns(X):
    """Derive fraud-risk indicator columns from the raw transaction fields.

    Returns a DataFrame with a fixed set of numeric columns: the raw
    amount/balances, balance deltas, an amount-to-balance ratio, a
    zero-balance flag, and one-hot flags for the transaction types.
    """
    frame = X.copy()

    # Net balance movement on each side of the transaction.
    frame['balance_change_orig'] = frame['newbalanceOrig'] - frame['oldbalanceOrg']
    frame['balance_change_dest'] = frame['newbalanceDest'] - frame['oldbalanceDest']

    # Amount relative to the pre-transaction origin balance.
    frame['amount_to_oldbal_ratio'] = np.where(
        frame['oldbalanceOrg'] > 0,
        frame['amount'] / frame['oldbalanceOrg'],
        999  # High value for zero balance
    )

    # Transfers where the origin account is empty both before and after.
    empty_before = frame['oldbalanceOrg'] == 0
    empty_after = frame['newbalanceOrig'] == 0
    frame['zero_balance_transfer'] = (empty_before & empty_after).astype(int)

    # One-hot flags for the transaction types of interest.
    for flag, type_name in (('is_transfer', 'TRANSFER'),
                            ('is_cash_out', 'CASH_OUT'),
                            ('is_payment', 'PAYMENT')):
        frame[flag] = (frame['type'] == type_name).astype(int)

    return frame[['amount', 'oldbalanceOrg', 'newbalanceOrig',
                  'oldbalanceDest', 'newbalanceDest',
                  'balance_change_orig', 'balance_change_dest',
                  'amount_to_oldbal_ratio', 'zero_balance_transfer',
                  'is_transfer', 'is_cash_out', 'is_payment']]

# Recall-weighted alternative to the plain F1 metric.
def weighted_f1_score(y_true, y_pred, beta=2):
    """F-beta score; beta > 1 weights recall more heavily than precision."""
    from sklearn.metrics import fbeta_score  # local import keeps the cell self-contained
    score = fbeta_score(y_true, y_pred, beta=beta)
    return score

# Custom experiment: engineered transaction patterns + class-balanced
# logistic regression, tracked in MLflow.
# NOTE(review): FunctionTransformer, Pipeline, ColumnTransformer,
# LogisticRegression, train_test_split, f1_score, infer_signature and the
# DataFrame `df` are assumed to be defined by earlier notebook cells —
# confirm before running this cell in isolation.
with mlflow.start_run(run_name="custom_fraud_detection_experiment") as run:
    # Wrap the pattern-derivation function so it can sit inside a Pipeline.
    feature_engineering = FunctionTransformer(calculate_transaction_patterns)
    
    # Columns produced by calculate_transaction_patterns, in output order.
    all_features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 
                   'oldbalanceDest', 'newbalanceDest',
                   'balance_change_orig', 'balance_change_dest',
                   'amount_to_oldbal_ratio', 'zero_balance_transfer',
                   'is_transfer', 'is_cash_out', 'is_payment']
    
    # RobustScaler resists the extreme outliers typical of fraud amounts.
    numeric_transformer = Pipeline(steps=[
        ('scaler', RobustScaler())
    ])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, all_features)
        ],
        remainder='drop'
    )
    
    # Full pipeline: engineer features -> scale -> class-balanced logistic model.
    model = Pipeline([
        ('feature_engineering', feature_engineering),
        ('preprocessor', preprocessor),
        ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000))
    ])
    
    # Stratified split keeps the fraud ratio equal in train and test.
    X = df.drop('isFraud', axis=1)
    y = df['isFraud']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    # Train model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Standard F1 plus an F-beta (beta=2) that weights recall more heavily.
    standard_f1 = f1_score(y_test, y_pred)
    weighted_f1 = weighted_f1_score(y_test, y_pred, beta=2)
    
    # Log parameters
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("feature_engineering", "transaction_patterns")
    mlflow.log_param("scaler", "RobustScaler")
    
    # Log metrics
    mlflow.log_metric("standard_f1", standard_f1)
    mlflow.log_metric("weighted_f1", weighted_f1)
    
    # Coefficients of the linear model double as feature-importance scores.
    coef_df = pd.DataFrame(
        model.named_steps['classifier'].coef_[0],
        index=all_features,
        columns=['coefficient']
    ).sort_values('coefficient', ascending=False)
    
    # Save feature importance plot
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 6))
    coef_df['coefficient'].plot(kind='bar')
    plt.title('Feature Importance')
    plt.tight_layout()
    plt.savefig('feature_importance.png')
    
    # Log the feature importance plot
    mlflow.log_artifact('feature_importance.png')
    
    # Log the model (signature inferred from training input / predictions).
    signature = infer_signature(X_train, y_pred)
    mlflow.sklearn.log_model(model, "fraud_detection_model", signature=signature)
    
    # Print results
    print(f"Standard F1 Score: {standard_f1:.3f}")
    print(f"Weighted F1 Score (β=2): {weighted_f1:.3f}")
    print("\nTop 5 Most Important Features:")
    print(coef_df.head())
Standard F1 Score: 0.929
Weighted F1 Score (β=2): 0.960

Top 5 Most Important Features:
                       coefficient
is_transfer               6.054947
is_cash_out               4.537580
zero_balance_transfer     3.880630
oldbalanceOrg             1.064719
newbalanceDest            0.025432
🏃 View run custom_fraud_detection_experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/461c5d2d8026445587464ddfb9f916d6
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3
_images/c3edb6c4a80b699dddfd45ad083a06a75c4d027988cb8d74030f1660f60218d0.png
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn
from sklearn.base import BaseEstimator, TransformerMixin
import matplotlib.pyplot as plt

class AdvancedFraudFeatures(BaseEstimator, TransformerMixin):
    """Stateless sklearn transformer that appends engineered fraud indicators.

    Expects a pandas DataFrame with the raw transaction columns and returns
    the same frame plus five derived risk-indicator columns.
    """

    def fit(self, X, y=None):
        # Nothing to learn; present only to satisfy the sklearn API.
        return self

    def transform(self, X):
        out = X.copy()

        # Amount relative to the combined pre-transaction balances (+1 guards /0).
        combined_before = out['oldbalanceOrg'] + out['oldbalanceDest'] + 1
        out['transaction_velocity'] = out['amount'] / combined_before

        # Fraction of the origin balance removed (1 when it started empty).
        out['balance_drain_ratio'] = np.where(
            out['oldbalanceOrg'] > 0,
            (out['oldbalanceOrg'] - out['newbalanceOrig']) / out['oldbalanceOrg'],
            1
        )

        # Origin account emptied entirely by this transaction.
        had_funds = out['oldbalanceOrg'] > 0
        now_empty = out['newbalanceOrig'] == 0
        out['complete_drain'] = (had_funds & now_empty).astype(int)

        # Transaction moves more than 90% of the origin balance.
        out['large_transfer'] = (out['amount'] > out['oldbalanceOrg'] * 0.9).astype(int)

        # Destination balance grew by more than 50%.
        out['dest_balance_increase'] = (out['newbalanceDest'] >
                                        out['oldbalanceDest'] * 1.5).astype(int)

        return out

# Gradient-boosting experiment with a custom feature transformer, tracked
# in MLflow.
# NOTE(review): this cell reads the global `data` loaded at the top of the
# notebook; the later "F1 scores plot" cell rebinds `data` to a plain list,
# so re-running this cell afterwards fails with
# "'list' object has no attribute 'drop'" — exactly the traceback shown
# below. Re-run the CSV-loading cell first, or rename one of the variables.
with mlflow.start_run(run_name="advanced_fraud_detection_v5") as run:
    # Define feature columns
    numeric_features = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 
                       'oldbalanceDest', 'newbalanceDest']
    categorical_features = ['type']
    
    # Scale numerics; one-hot encode `type` (first level dropped).
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_features)
        ]
    )
    
    # Pipeline: engineered features -> preprocessing -> gradient boosting.
    model = Pipeline([
        ('feature_engineering', AdvancedFraudFeatures()),
        ('preprocessor', preprocessor),
        ('classifier', GradientBoostingClassifier(
            n_estimators=200,
            learning_rate=0.1,
            max_depth=5,
            subsample=0.8,
            random_state=42
        ))
    ])
    
    # Stratified split preserves the class (fraud) ratio in both sets.
    X = data.drop('isFraud', axis=1)
    y = data['isFraud']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    # Fit the model
    model.fit(X_train, y_train)
    
    # Predictions plus positive-class probabilities (for the PR curve).
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    f1 = f1_score(y_test, y_pred)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    pr_auc = auc(recall, precision)
    
    # Log parameters and metrics
    mlflow.log_params({
        "model_type": "GradientBoostingClassifier",
        "n_estimators": 200,
        "learning_rate": 0.1,
        "max_depth": 5,
        "subsample": 0.8
    })
    
    mlflow.log_metrics({
        "f1_score": f1,
        "pr_auc": pr_auc
    })
    
    # Create and log PR curve
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, label=f'PR curve (AUC = {pr_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.grid(True)
    plt.savefig('pr_curve.png')
    mlflow.log_artifact('pr_curve.png')
    plt.close()
    
    # Get feature names after pipeline transformation
    # First, get the feature names from custom transformer
    engineered_feature_names = [
        'transaction_velocity', 'balance_drain_ratio', 
        'complete_drain', 'large_transfer', 
        'dest_balance_increase'
    ]
    
    # Get categorical feature names
    categorical_encoder = model.named_steps['preprocessor'].named_transformers_['cat']
    categorical_feature_names = categorical_encoder.get_feature_names_out(categorical_features).tolist()
    
    # NOTE(review): this ordering assumes the ColumnTransformer output is
    # numeric + engineered + categorical; the guard below skips the plot if
    # the lengths disagree — confirm the order against the fitted pipeline.
    final_feature_names = numeric_features + engineered_feature_names + categorical_feature_names
    
    # Get feature importances
    importances = model.named_steps['classifier'].feature_importances_
    
    # Debug print
    print(f"Number of feature names: {len(final_feature_names)}")
    print(f"Number of importance values: {len(importances)}")
    print("Feature names:", final_feature_names)
    
    # Create feature importance dataframe only if lengths match
    if len(final_feature_names) == len(importances):
        importance_df = pd.DataFrame({
            'feature': final_feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)
        
        # Create and log feature importance plot
        plt.figure(figsize=(12, 6))
        plt.bar(range(len(importance_df)), importance_df['importance'])
        plt.xticks(range(len(importance_df)), importance_df['feature'], rotation=45, ha='right')
        plt.title('Feature Importance')
        plt.tight_layout()
        plt.savefig('feature_importance.png')
        mlflow.log_artifact('feature_importance.png')
        plt.close()
        
        # Print results
        print(f"\nF1 Score: {f1:.3f}")
        print(f"PR AUC Score: {pr_auc:.3f}")
        print("\nTop 5 Most Important Features:")
        print(importance_df.head())
    else:
        print("\nError: Number of features doesn't match number of importance values")
        print("Skipping feature importance visualization")
    
    # Log the model
    mlflow.sklearn.log_model(model, "custom_experiment2")
🏃 View run advanced_fraud_detection_v5 at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3/runs/526bc986e29244c5b00b82dd112aac9c
🧪 View experiment at: https://dagshub.com/charankonduru2003/fraud_detection.mlflow/#/experiments/3
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[45], line 70
     57 model = Pipeline([
     58     ('feature_engineering', AdvancedFraudFeatures()),
     59     ('preprocessor', preprocessor),
   (...)
     66     ))
     67 ])
     69 # Split data
---> 70 X = data.drop('isFraud', axis=1)
     71 y = data['isFraud']
     72 X_train, X_test, y_train, y_test = train_test_split(
     73     X, y, test_size=0.2, stratify=y, random_state=42
     74 )

AttributeError: 'list' object has no attribute 'drop'

F1 scores plot#

import mlflow
import mlflow.tracking
import pandas as pd
import matplotlib.pyplot as plt

client = mlflow.tracking.MlflowClient()

# Gather every run from every experiment on the tracking server.
experiments = client.search_experiments()
runs = []
for exp in experiments:
    runs += client.search_runs(exp.experiment_id)

# BUG FIX: these locals were previously named `data` and `df`, which
# clobbered the dataset DataFrame / feature frame used by earlier cells and
# caused "'list' object has no attribute 'drop'" when those cells were
# re-run afterwards (see the traceback above). Dedicated names avoid the
# collision.
f1_records = []
f1_score_keys = ['f1_score', 'train_f1_score', 'standard_f1']

# Different experiments logged their F1 under different metric keys;
# collect every variant present on each run.
for run in runs:
    for key in f1_score_keys:
        if key in run.data.metrics:
            f1_records.append({
                'experiment_name': run.data.tags.get('mlflow.runName'),
                'f1_score': run.data.metrics[key],
                'f1_type': key  # To track which type of F1 was logged
            })

scores_df = pd.DataFrame(f1_records)
plt.figure(figsize=(12, 6))

# Plot each F1 type with a different color
for f1_type in scores_df['f1_type'].unique():
    subset = scores_df[scores_df['f1_type'] == f1_type]
    plt.bar(subset['experiment_name'], subset['f1_score'], label=f1_type)

plt.xlabel('Experiment')
plt.ylabel('F1 Score')
plt.title('F1 Score Comparison Across Experiments')
plt.xticks(rotation=15)
plt.ylim(0.7, 1)  # Adjust if necessary
plt.legend(title="F1 Type")
plt.grid(axis='y')
plt.show()
_images/065a1835b2de722862951b0f4268b358cf4d8a8b379c7bf4abb6b0d3abed5c75.png